source("data_exploration_cleaning.R")
PLU 4046 - small Hass; PLU 4225 - large Hass; PLU 4770 - extra large Hass
Convert Sizes into percentage of total sales
Convert Bag numbers into percentage sold in small bags
Convert Date into season
Convert conventional/organic into organic - TRUE/FALSE
Regions - extract the regions which equal totalUS figures
Convert year to factor
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(modelr)
# Fit the full model on every column, then ask alias() which predictors are
# exact linear combinations of the others.
alias_avocado <- lm(
  average_price ~ .,
  data = avocados_cleaned
)
alias(alias_avocado)
## Model :
## average_price ~ total_volume + year + region + season + small_hass_percent +
## large_hass_percent + xl_hass_percent + non_hass_percent +
## small_bag_percent + organic
##
## Complete :
## (Intercept) total_volume year2016 year2017 year2018
## non_hass_percent 100 0 0 0 0
## regionGreatLakes regionMidsouth regionNortheast regionPlains
## non_hass_percent 0 0 0 0
## regionSouthCentral regionSoutheast regionWest seasonspring
## non_hass_percent 0 0 0 0
## seasonsummer seasonwinter small_hass_percent
## non_hass_percent 0 0 -1
## large_hass_percent xl_hass_percent small_bag_percent
## non_hass_percent -1 -1 0
## organicTRUE
## non_hass_percent 0
# Remove the column that alias() reported as a linear combination of the
# other Hass percentage columns.
avocados_cleaned <- select(avocados_cleaned, -non_hass_percent)

# Numeric columns only.
avocados_numeric <- select_if(avocados_cleaned, is.numeric)

# Every non-numeric column, with the response appended so it can be plotted
# against them.
avocados_non_numeric <- select_if(avocados_cleaned, Negate(is.numeric))
avocados_non_numeric[["average_price"]] <- avocados_cleaned[["average_price"]]
# PCA on the numeric columns, centred and scaled to unit variance; summary()
# reports the variance explained by each component.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
summary(prcomp(avocados_numeric, center = TRUE, scale. = TRUE))
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.51 1.2022 1.0052 0.8316 0.56643 0.50211
## Proportion of Variance 0.38 0.2409 0.1684 0.1153 0.05347 0.04202
## Cumulative Proportion 0.38 0.6209 0.7893 0.9045 0.95798 1.00000
# Pairs plot of the numeric columns; correlation text in the upper panels is
# drawn small so it fits.
ggpairs(
  avocados_numeric,
  upper = list(continuous = wrap("cor", family = "sans", size = 2))
)
# Same plot for the non-numeric columns plus average_price.
ggpairs(
  avocados_non_numeric,
  upper = list(continuous = wrap("cor", family = "sans", size = 2))
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Categorical predictors ordered from most to least apparent movement in price (estimate): organic, region, season, year
First predictors to test: total_volume, small_hass_percent, organic, region
# Round 1: one single-predictor model per candidate.
model_1a <- lm(
  average_price ~ total_volume,
  data = avocados_cleaned
)
model_1b <- lm(
  average_price ~ small_hass_percent,
  data = avocados_cleaned
)
model_1c <- lm(
  average_price ~ organic,
  data = avocados_cleaned
)
model_1d <- lm(
  average_price ~ region,
  data = avocados_cleaned
)

summary(model_1a)
##
## Call:
## lm(formula = average_price ~ total_volume, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7336 -0.1692 -0.0108 0.1471 0.9908
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.597e+00 6.273e-03 254.63 <2e-16 ***
## total_volume -1.054e-07 1.941e-09 -54.31 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2418 on 2702 degrees of freedom
## Multiple R-squared: 0.5219, Adjusted R-squared: 0.5217
## F-statistic: 2949 on 1 and 2702 DF, p-value: < 2.2e-16
# Diagnostic plots for model_1a on a 2 x 2 grid, then the model_1b summary.
par(mfrow = c(2, 2))
plot(model_1a)

summary(model_1b)
##
## Call:
## lm(formula = average_price ~ small_hass_percent, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.76188 -0.23039 -0.02244 0.19097 1.17501
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.6050637 0.0096102 167.02 <2e-16 ***
## small_hass_percent -0.0093481 0.0003033 -30.82 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3008 on 2702 degrees of freedom
## Multiple R-squared: 0.2601, Adjusted R-squared: 0.2598
## F-statistic: 949.7 on 1 and 2702 DF, p-value: < 2.2e-16
# Diagnostic plots for model_1b on a 2 x 2 grid, then the model_1c summary.
par(mfrow = c(2, 2))
plot(model_1b)

summary(model_1c)
##
## Call:
## lm(formula = average_price ~ organic, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.77939 -0.16774 -0.01774 0.16061 0.97061
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.127744 0.006894 163.6 <2e-16 ***
## organicTRUE 0.481649 0.009749 49.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2535 on 2702 degrees of freedom
## Multiple R-squared: 0.4746, Adjusted R-squared: 0.4744
## F-statistic: 2441 on 1 and 2702 DF, p-value: < 2.2e-16
# Diagnostic plots for model_1c on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model_1c)
## hat values (leverages) are all = 0.000739645
## and there are no factor predictors; no plot no. 5
# Coefficients and fit statistics for the region-only model.
summary(model_1d)
##
## Call:
## lm(formula = average_price ~ region, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.77802 -0.27490 -0.00533 0.24524 1.24778
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.395325 0.017588 79.333 < 2e-16 ***
## regionGreatLakes -0.056775 0.024873 -2.283 0.0225 *
## regionMidsouth 0.009438 0.024873 0.379 0.7044
## regionNortheast 0.206598 0.024873 8.306 < 2e-16 ***
## regionPlains 0.041183 0.024873 1.656 0.0979 .
## regionSouthCentral -0.294083 0.024873 -11.823 < 2e-16 ***
## regionSoutheast 0.002692 0.024873 0.108 0.9138
## regionWest -0.123107 0.024873 -4.949 7.91e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3234 on 2696 degrees of freedom
## Multiple R-squared: 0.1469, Adjusted R-squared: 0.1447
## F-statistic: 66.32 on 7 and 2696 DF, p-value: < 2.2e-16
# Diagnostic plots for model_1d on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model_1d)
## hat values (leverages) are all = 0.00295858
## and there are no factor predictors; no plot no. 5
# Residuals of model_1a, paired against the columns not yet in the model
# (the response and total_volume are dropped).
avocados_resid <- select(
  add_residuals(avocados_cleaned, model_1a),
  -average_price, -total_volume
)
ggpairs(
  avocados_resid,
  upper = list(continuous = wrap("cor", family = "sans", size = 2))
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Round 2: keep total_volume and add one further predictor per model.
model_2a <- lm(
  average_price ~ total_volume + small_hass_percent,
  data = avocados_cleaned
)
model_2b <- lm(
  average_price ~ total_volume + region,
  data = avocados_cleaned
)
model_2c <- lm(
  average_price ~ total_volume + year,
  data = avocados_cleaned
)
model_2d <- lm(
  average_price ~ total_volume + season,
  data = avocados_cleaned
)

summary(model_2a)
##
## Call:
## lm(formula = average_price ~ total_volume + small_hass_percent,
## data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.80364 -0.15228 -0.01241 0.13800 0.98808
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.668e+00 7.546e-03 221.09 <2e-16 ***
## total_volume -9.031e-08 2.097e-09 -43.06 <2e-16 ***
## small_hass_percent -4.102e-03 2.635e-04 -15.57 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2317 on 2701 degrees of freedom
## Multiple R-squared: 0.5613, Adjusted R-squared: 0.5609
## F-statistic: 1728 on 2 and 2701 DF, p-value: < 2.2e-16
# Diagnostic plots for model_2a on a 2 x 2 grid, then the model_2b summary.
par(mfrow = c(2, 2))
plot(model_2a)

summary(model_2b)
##
## Call:
## lm(formula = average_price ~ total_volume + region, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.67412 -0.14399 -0.00593 0.12989 0.91701
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.723e+00 1.276e-02 135.084 < 2e-16 ***
## total_volume -1.077e-07 1.798e-09 -59.892 < 2e-16 ***
## regionGreatLakes -1.967e-01 1.646e-02 -11.951 < 2e-16 ***
## regionMidsouth -1.564e-01 1.653e-02 -9.464 < 2e-16 ***
## regionNortheast 1.060e-01 1.638e-02 6.472 1.14e-10 ***
## regionPlains -1.875e-01 1.674e-02 -11.203 < 2e-16 ***
## regionSouthCentral -2.997e-01 1.629e-02 -18.394 < 2e-16 ***
## regionSoutheast -1.291e-01 1.644e-02 -7.852 5.84e-15 ***
## regionWest -1.047e-01 1.630e-02 -6.424 1.57e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2118 on 2695 degrees of freedom
## Multiple R-squared: 0.634, Adjusted R-squared: 0.6329
## F-statistic: 583.6 on 8 and 2695 DF, p-value: < 2.2e-16
# Diagnostic plots for model_2b on a 2 x 2 grid, then the model_2c summary.
par(mfrow = c(2, 2))
plot(model_2b)

summary(model_2c)
##
## Call:
## lm(formula = average_price ~ total_volume + year, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.83794 -0.14728 -0.01527 0.13855 1.05298
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.554e+00 8.820e-03 176.161 < 2e-16 ***
## total_volume -1.060e-07 1.859e-09 -57.053 < 2e-16 ***
## year2016 -1.861e-02 1.133e-02 -1.643 0.10054
## year2017 1.481e-01 1.127e-02 13.135 < 2e-16 ***
## year2018 5.881e-02 1.854e-02 3.172 0.00153 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2309 on 2699 degrees of freedom
## Multiple R-squared: 0.5645, Adjusted R-squared: 0.5639
## F-statistic: 874.7 on 4 and 2699 DF, p-value: < 2.2e-16
# Diagnostic plots for model_2c on a 2 x 2 grid, then the model_2d summary.
par(mfrow = c(2, 2))
plot(model_2c)

summary(model_2d)
##
## Call:
## lm(formula = average_price ~ total_volume + season, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.67947 -0.14570 -0.01234 0.14481 0.90127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.696e+00 9.741e-03 174.065 < 2e-16 ***
## total_volume -1.029e-07 1.845e-09 -55.775 < 2e-16 ***
## seasonspring -1.532e-01 1.263e-02 -12.135 < 2e-16 ***
## seasonsummer -4.431e-02 1.298e-02 -3.413 0.000653 ***
## seasonwinter -1.928e-01 1.244e-02 -15.501 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.229 on 2699 degrees of freedom
## Multiple R-squared: 0.5718, Adjusted R-squared: 0.5712
## F-statistic: 901.2 on 4 and 2699 DF, p-value: < 2.2e-16
# Diagnostic plots for model_2d on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model_2d)
# Residuals of model_2b, paired against the columns not yet in the model
# (the response, total_volume and region are dropped).
avocados_resid <- select(
  add_residuals(avocados_cleaned, model_2b),
  -average_price, -total_volume, -region
)
ggpairs(
  avocados_resid,
  upper = list(continuous = wrap("cor", family = "sans", size = 2))
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Round 3: keep total_volume + region and add one further term per model.
model_3a <- lm(
  average_price ~ total_volume + region + season,
  data = avocados_cleaned
)
model_3b <- lm(
  average_price ~ total_volume + region + year,
  data = avocados_cleaned
)
model_3c <- lm(
  average_price ~ total_volume + region + organic,
  data = avocados_cleaned
)
model_3d <- lm(
  average_price ~ total_volume + region + total_volume:region,
  data = avocados_cleaned
)

summary(model_3a)
##
## Call:
## lm(formula = average_price ~ total_volume + region + season,
## data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.61244 -0.12545 -0.00437 0.12419 0.85723
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.818e+00 1.352e-02 134.418 < 2e-16 ***
## total_volume -1.049e-07 1.680e-09 -62.429 < 2e-16 ***
## regionGreatLakes -1.931e-01 1.532e-02 -12.608 < 2e-16 ***
## regionMidsouth -1.522e-01 1.538e-02 -9.892 < 2e-16 ***
## regionNortheast 1.086e-01 1.524e-02 7.125 1.33e-12 ***
## regionPlains -1.816e-01 1.558e-02 -11.659 < 2e-16 ***
## regionSouthCentral -2.996e-01 1.516e-02 -19.758 < 2e-16 ***
## regionSoutheast -1.257e-01 1.530e-02 -8.217 3.21e-16 ***
## regionWest -1.052e-01 1.516e-02 -6.935 5.07e-12 ***
## seasonspring -1.522e-01 1.088e-02 -13.991 < 2e-16 ***
## seasonsummer -4.349e-02 1.118e-02 -3.890 0.000103 ***
## seasonwinter -1.917e-01 1.071e-02 -17.901 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1971 on 2692 degrees of freedom
## Multiple R-squared: 0.6835, Adjusted R-squared: 0.6822
## F-statistic: 528.5 on 11 and 2692 DF, p-value: < 2.2e-16
# Diagnostic plots for model_3a on a 2 x 2 grid, then the model_3b summary.
par(mfrow = c(2, 2))
plot(model_3a)

summary(model_3b)
##
## Call:
## lm(formula = average_price ~ total_volume + region + year, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.77451 -0.12677 -0.00241 0.12842 0.92636
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.680e+00 1.319e-02 127.354 < 2e-16 ***
## total_volume -1.084e-07 1.696e-09 -63.922 < 2e-16 ***
## regionGreatLakes -1.977e-01 1.548e-02 -12.770 < 2e-16 ***
## regionMidsouth -1.575e-01 1.554e-02 -10.136 < 2e-16 ***
## regionNortheast 1.054e-01 1.540e-02 6.840 9.79e-12 ***
## regionPlains -1.890e-01 1.574e-02 -12.010 < 2e-16 ***
## regionSouthCentral -2.998e-01 1.532e-02 -19.565 < 2e-16 ***
## regionSoutheast -1.300e-01 1.546e-02 -8.408 < 2e-16 ***
## regionWest -1.046e-01 1.532e-02 -6.824 1.09e-11 ***
## year2016 -1.815e-02 9.771e-03 -1.857 0.063359 .
## year2017 1.486e-01 9.726e-03 15.276 < 2e-16 ***
## year2018 6.054e-02 1.600e-02 3.785 0.000157 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1992 on 2692 degrees of freedom
## Multiple R-squared: 0.6768, Adjusted R-squared: 0.6755
## F-statistic: 512.5 on 11 and 2692 DF, p-value: < 2.2e-16
# Diagnostic plots for model_3b on a 2 x 2 grid, then the model_3c summary.
par(mfrow = c(2, 2))
plot(model_3b)

summary(model_3c)
##
## Call:
## lm(formula = average_price ~ total_volume + region + organic,
## data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.70990 -0.13747 -0.01785 0.11719 0.94201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.478e+00 2.402e-02 61.517 < 2e-16 ***
## total_volume -6.343e-08 4.096e-09 -15.488 < 2e-16 ***
## regionGreatLakes -1.392e-01 1.675e-02 -8.312 < 2e-16 ***
## regionMidsouth -8.827e-02 1.709e-02 -5.165 2.58e-07 ***
## regionNortheast 1.474e-01 1.634e-02 9.020 < 2e-16 ***
## regionPlains -9.352e-02 1.811e-02 -5.165 2.58e-07 ***
## regionSouthCentral -2.974e-01 1.588e-02 -18.724 < 2e-16 ***
## regionSoutheast -7.495e-02 1.665e-02 -4.501 7.07e-06 ***
## regionWest -1.123e-01 1.590e-02 -7.061 2.09e-12 ***
## organicTRUE 2.218e-01 1.856e-02 11.952 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2065 on 2694 degrees of freedom
## Multiple R-squared: 0.6524, Adjusted R-squared: 0.6513
## F-statistic: 561.9 on 9 and 2694 DF, p-value: < 2.2e-16
# Diagnostic plots for model_3c on a 2 x 2 grid, then the model_3d summary.
par(mfrow = c(2, 2))
plot(model_3c)

summary(model_3d)
##
## Call:
## lm(formula = average_price ~ total_volume + region + total_volume:region,
## data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.65689 -0.12925 -0.01979 0.11513 0.95087
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.708e+00 1.530e-02 111.685 < 2e-16 ***
## total_volume -1.028e-07 3.574e-09 -28.761 < 2e-16 ***
## regionGreatLakes -1.882e-01 2.174e-02 -8.657 < 2e-16 ***
## regionMidsouth -9.472e-02 2.178e-02 -4.350 1.41e-05 ***
## regionNortheast 1.653e-01 2.173e-02 7.606 3.87e-14 ***
## regionPlains 1.130e-02 2.160e-02 0.523 0.600853
## regionSouthCentral -3.665e-01 2.161e-02 -16.963 < 2e-16 ***
## regionSoutheast -7.264e-02 2.139e-02 -3.396 0.000693 ***
## regionWest -1.253e-01 2.175e-02 -5.758 9.46e-09 ***
## total_volume:regionGreatLakes -1.267e-09 7.292e-09 -0.174 0.862056
## total_volume:regionMidsouth -3.602e-08 8.239e-09 -4.372 1.28e-05 ***
## total_volume:regionNortheast -2.591e-08 6.348e-09 -4.082 4.60e-05 ***
## total_volume:regionPlains -2.046e-07 1.228e-08 -16.663 < 2e-16 ***
## total_volume:regionSouthCentral 2.240e-08 5.087e-09 4.404 1.10e-05 ***
## total_volume:regionSoutheast -2.774e-08 6.735e-09 -4.119 3.92e-05 ***
## total_volume:regionWest 6.133e-09 4.973e-09 1.233 0.217549
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1976 on 2688 degrees of freedom
## Multiple R-squared: 0.6822, Adjusted R-squared: 0.6805
## F-statistic: 384.7 on 15 and 2688 DF, p-value: < 2.2e-16
# Diagnostic plots for model_3d on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model_3d)
# Residuals of the model selected in round 3 (model_3a: total_volume + region
# + season), paired against the columns not yet in that model.
# Fix: the residuals were previously taken from model_2b, which does not
# contain season, even though season is dropped from the comparison below.
avocados_resid <- avocados_cleaned %>%
  add_residuals(model_3a) %>%
  select(-average_price, -total_volume, -region, -season)
ggpairs(
  avocados_resid,
  upper = list(continuous = wrap("cor", family = "sans", size = 2))
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
model_3a: average_price ~ total_volume + region + season
Candidate interactions: total_volume:region, total_volume:season, region:season
# Residuals of model_3a with the response removed, used to look for
# interactions between the predictors already in the model.
avocados_int_resid <- avocados_cleaned %>%
  add_residuals(model_3a) %>%
  select(-average_price)

# total_volume:region -- residuals against volume with one linear fit per
# region. FALSE is spelled out because F is a reassignable variable.
avocados_int_resid %>%
  ggplot(aes(x = total_volume, y = resid, colour = region)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# total_volume:season -- residuals against volume with one linear fit per
# season. FALSE is spelled out because F is a reassignable variable.
avocados_int_resid %>%
  ggplot(aes(x = total_volume, y = resid, colour = season)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# region:season -- residual box plots per region, coloured by season.
# FALSE is spelled out because F is a reassignable variable.
# NOTE(review): x is a categorical region here, so the linear smooth layer
# probably adds nothing to the box plots -- confirm and consider removing it.
avocados_int_resid %>%
  ggplot(aes(x = region, y = resid, colour = season)) +
  geom_boxplot() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Round 4: keep total_volume + region + season and add either one
# interaction or one further main effect per model.
model_4a <- lm(
  average_price ~ total_volume + region + season + total_volume:region,
  data = avocados_cleaned
)
model_4b <- lm(
  average_price ~ total_volume + region + season + total_volume:season,
  data = avocados_cleaned
)
model_4c <- lm(
  average_price ~ total_volume + region + season + region:season,
  data = avocados_cleaned
)
model_4d <- lm(
  average_price ~ total_volume + region + season + organic,
  data = avocados_cleaned
)
model_4e <- lm(
  average_price ~ total_volume + region + season + year,
  data = avocados_cleaned
)

summary(model_4a)
##
## Call:
## lm(formula = average_price ~ total_volume + region + season +
## total_volume:region, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.60375 -0.11164 -0.01438 0.10870 0.88893
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.800e+00 1.537e-02 117.077 < 2e-16 ***
## total_volume -1.005e-07 3.302e-09 -30.447 < 2e-16 ***
## regionGreatLakes -1.883e-01 2.007e-02 -9.384 < 2e-16 ***
## regionMidsouth -9.353e-02 2.010e-02 -4.652 3.44e-06 ***
## regionNortheast 1.670e-01 2.006e-02 8.324 < 2e-16 ***
## regionPlains 1.169e-02 1.994e-02 0.586 0.557838
## regionSouthCentral -3.651e-01 1.995e-02 -18.302 < 2e-16 ***
## regionSoutheast -7.204e-02 1.975e-02 -3.648 0.000269 ***
## regionWest -1.266e-01 2.008e-02 -6.306 3.33e-10 ***
## seasonspring -1.457e-01 1.007e-02 -14.468 < 2e-16 ***
## seasonsummer -3.742e-02 1.035e-02 -3.614 0.000307 ***
## seasonwinter -1.861e-01 9.919e-03 -18.763 < 2e-16 ***
## total_volume:regionGreatLakes 4.994e-10 6.733e-09 0.074 0.940877
## total_volume:regionMidsouth -3.449e-08 7.607e-09 -4.534 6.03e-06 ***
## total_volume:regionNortheast -2.571e-08 5.861e-09 -4.387 1.19e-05 ***
## total_volume:regionPlains -1.998e-07 1.134e-08 -17.617 < 2e-16 ***
## total_volume:regionSouthCentral 2.196e-08 4.696e-09 4.677 3.06e-06 ***
## total_volume:regionSoutheast -2.655e-08 6.218e-09 -4.269 2.03e-05 ***
## total_volume:regionWest 6.442e-09 4.591e-09 1.403 0.160658
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1825 on 2685 degrees of freedom
## Multiple R-squared: 0.7295, Adjusted R-squared: 0.7277
## F-statistic: 402.3 on 18 and 2685 DF, p-value: < 2.2e-16
# Diagnostic plots for model_4a on a 2 x 2 grid, then the model_4b summary.
par(mfrow = c(2, 2))
plot(model_4a)

summary(model_4b)
##
## Call:
## lm(formula = average_price ~ total_volume + region + season +
## total_volume:season, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.59115 -0.12614 -0.00355 0.12496 0.84755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.873e+00 1.494e-02 125.315 < 2e-16 ***
## total_volume -1.349e-07 4.054e-09 -33.279 < 2e-16 ***
## regionGreatLakes -1.953e-01 1.511e-02 -12.927 < 2e-16 ***
## regionMidsouth -1.544e-01 1.517e-02 -10.176 < 2e-16 ***
## regionNortheast 1.077e-01 1.503e-02 7.163 1.01e-12 ***
## regionPlains -1.853e-01 1.537e-02 -12.057 < 2e-16 ***
## regionSouthCentral -2.992e-01 1.495e-02 -20.011 < 2e-16 ***
## regionSoutheast -1.277e-01 1.509e-02 -8.460 < 2e-16 ***
## regionWest -1.057e-01 1.495e-02 -7.067 2.01e-12 ***
## seasonspring -2.333e-01 1.453e-02 -16.055 < 2e-16 ***
## seasonsummer -8.761e-02 1.498e-02 -5.849 5.55e-09 ***
## seasonwinter -2.593e-01 1.420e-02 -18.261 < 2e-16 ***
## total_volume:seasonspring 4.184e-08 4.949e-09 8.454 < 2e-16 ***
## total_volume:seasonsummer 2.568e-08 5.177e-09 4.961 7.45e-07 ***
## total_volume:seasonwinter 3.603e-08 4.843e-09 7.440 1.35e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1944 on 2689 degrees of freedom
## Multiple R-squared: 0.6926, Adjusted R-squared: 0.691
## F-statistic: 432.7 on 14 and 2689 DF, p-value: < 2.2e-16
# Diagnostic plots for model_4b on a 2 x 2 grid, then the model_4c summary.
par(mfrow = c(2, 2))
plot(model_4b)

summary(model_4c)
##
## Call:
## lm(formula = average_price ~ total_volume + region + season +
## region:season, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62127 -0.13061 0.00244 0.12166 0.78782
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.888e+00 2.228e-02 84.738 < 2e-16 ***
## total_volume -1.046e-07 1.651e-09 -63.369 < 2e-16 ***
## regionGreatLakes -2.655e-01 3.103e-02 -8.559 < 2e-16 ***
## regionMidsouth -2.377e-01 3.104e-02 -7.657 2.64e-14 ***
## regionNortheast -7.954e-02 3.100e-02 -2.566 0.010341 *
## regionPlains -2.288e-01 3.111e-02 -7.354 2.55e-13 ***
## regionSouthCentral -3.851e-01 3.098e-02 -12.434 < 2e-16 ***
## regionSoutheast -1.804e-01 3.102e-02 -5.815 6.80e-09 ***
## regionWest -1.411e-01 3.098e-02 -4.555 5.48e-06 ***
## seasonspring -2.720e-01 3.010e-02 -9.035 < 2e-16 ***
## seasonsummer -7.778e-02 3.099e-02 -2.510 0.012143 *
## seasonwinter -3.084e-01 2.966e-02 -10.399 < 2e-16 ***
## regionGreatLakes:seasonspring 1.339e-01 4.255e-02 3.147 0.001669 **
## regionMidsouth:seasonspring 1.418e-01 4.255e-02 3.333 0.000871 ***
## regionNortheast:seasonspring 3.099e-01 4.255e-02 7.284 4.23e-13 ***
## regionPlains:seasonspring 9.446e-02 4.255e-02 2.220 0.026504 *
## regionSouthCentral:seasonspring 1.458e-01 4.254e-02 3.427 0.000620 ***
## regionSoutheast:seasonspring 1.268e-01 4.255e-02 2.980 0.002904 **
## regionWest:seasonspring 4.676e-03 4.254e-02 0.110 0.912497
## regionGreatLakes:seasonsummer 1.567e-02 4.381e-02 0.358 0.720530
## regionMidsouth:seasonsummer 3.330e-02 4.381e-02 0.760 0.447251
## regionNortheast:seasonsummer 1.717e-01 4.381e-02 3.920 9.06e-05 ***
## regionPlains:seasonsummer -2.590e-03 4.381e-02 -0.059 0.952867
## regionSouthCentral:seasonsummer -7.666e-03 4.381e-02 -0.175 0.861097
## regionSoutheast:seasonsummer -2.190e-02 4.381e-02 -0.500 0.617158
## regionWest:seasonsummer 8.474e-02 4.381e-02 1.934 0.053159 .
## regionGreatLakes:seasonwinter 1.235e-01 4.190e-02 2.947 0.003233 **
## regionMidsouth:seasonwinter 1.489e-01 4.191e-02 3.553 0.000388 ***
## regionNortheast:seasonwinter 2.449e-01 4.191e-02 5.845 5.68e-09 ***
## regionPlains:seasonwinter 8.579e-02 4.191e-02 2.047 0.040761 *
## regionSouthCentral:seasonwinter 1.776e-01 4.190e-02 4.239 2.32e-05 ***
## regionSoutheast:seasonwinter 9.728e-02 4.191e-02 2.321 0.020342 *
## regionWest:seasonwinter 5.429e-02 4.190e-02 1.296 0.195222
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1934 on 2671 degrees of freedom
## Multiple R-squared: 0.6975, Adjusted R-squared: 0.6939
## F-statistic: 192.5 on 32 and 2671 DF, p-value: < 2.2e-16
# Diagnostic plots for model_4c on a 2 x 2 grid, then the model_4d summary.
par(mfrow = c(2, 2))
plot(model_4c)

summary(model_4d)
##
## Call:
## lm(formula = average_price ~ total_volume + region + season +
## organic, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.66258 -0.11332 -0.00396 0.10760 0.89333
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.511e+00 2.193e-02 68.905 < 2e-16 ***
## total_volume -4.539e-08 3.806e-09 -11.927 < 2e-16 ***
## regionGreatLakes -1.158e-01 1.522e-02 -7.608 3.84e-14 ***
## regionMidsouth -6.048e-02 1.554e-02 -3.892 0.000102 ***
## regionNortheast 1.642e-01 1.482e-02 11.076 < 2e-16 ***
## regionPlains -5.521e-02 1.651e-02 -3.345 0.000834 ***
## regionSouthCentral -2.965e-01 1.439e-02 -20.597 < 2e-16 ***
## regionSoutheast -5.287e-02 1.513e-02 -3.495 0.000481 ***
## regionWest -1.153e-01 1.441e-02 -8.006 1.74e-15 ***
## seasonspring -1.842e-01 1.049e-02 -17.563 < 2e-16 ***
## seasonsummer -6.802e-02 1.071e-02 -6.352 2.48e-10 ***
## seasonwinter -2.230e-01 1.033e-02 -21.594 < 2e-16 ***
## organicTRUE 2.957e-01 1.717e-02 17.224 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1871 on 2691 degrees of freedom
## Multiple R-squared: 0.7149, Adjusted R-squared: 0.7136
## F-statistic: 562.4 on 12 and 2691 DF, p-value: < 2.2e-16
# Diagnostic plots for model_4d on a 2 x 2 grid, then the model_4e summary.
par(mfrow = c(2, 2))
plot(model_4d)

summary(model_4e)
##
## Call:
## lm(formula = average_price ~ total_volume + region + season +
## year, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.71390 -0.10783 0.00554 0.10804 0.82929
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.776e+00 1.339e-02 132.671 < 2e-16 ***
## total_volume -1.061e-07 1.549e-09 -68.521 < 2e-16 ***
## regionGreatLakes -1.947e-01 1.409e-02 -13.822 < 2e-16 ***
## regionMidsouth -1.540e-01 1.414e-02 -10.888 < 2e-16 ***
## regionNortheast 1.075e-01 1.402e-02 7.669 2.41e-14 ***
## regionPlains -1.842e-01 1.432e-02 -12.856 < 2e-16 ***
## regionSouthCentral -2.996e-01 1.394e-02 -21.492 < 2e-16 ***
## regionSoutheast -1.272e-01 1.407e-02 -9.040 < 2e-16 ***
## regionWest -1.050e-01 1.394e-02 -7.527 7.02e-14 ***
## seasonspring -1.602e-01 1.008e-02 -15.894 < 2e-16 ***
## seasonsummer -4.299e-02 1.028e-02 -4.182 2.98e-05 ***
## seasonwinter -2.124e-01 1.013e-02 -20.971 < 2e-16 ***
## year2016 -1.759e-02 8.893e-03 -1.978 0.048 *
## year2017 1.511e-01 8.854e-03 17.070 < 2e-16 ***
## year2018 1.510e-01 1.509e-02 10.002 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1812 on 2689 degrees of freedom
## Multiple R-squared: 0.7327, Adjusted R-squared: 0.7313
## F-statistic: 526.5 on 14 and 2689 DF, p-value: < 2.2e-16
# Diagnostic plots for model_4e on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model_4e)
#####average_price ~ total_volume + region + season + year (model_4e)
# Residuals of the model selected in round 4 (model_4e: total_volume + region
# + season + year), paired against the columns not yet in that model.
# Fix: the residuals were previously taken from model_2b, which contains
# neither season nor year, even though both are dropped from the comparison.
avocados_resid <- avocados_cleaned %>%
  add_residuals(model_4e) %>%
  select(-average_price, -total_volume, -region, -season, -year)
ggpairs(
  avocados_resid,
  upper = list(continuous = wrap("cor", family = "sans", size = 2))
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Round 5: keep total_volume + region + season + year and add one further
# main effect or interaction per model.
model_5a <- lm(
  average_price ~ total_volume + region + season + year + organic,
  data = avocados_cleaned
)
model_5b <- lm(
  average_price ~ total_volume + region + season + year + total_volume:region,
  data = avocados_cleaned
)
model_5c <- lm(
  average_price ~ total_volume + region + season + year + small_hass_percent,
  data = avocados_cleaned
)
model_5d <- lm(
  average_price ~ total_volume + region + season + year + total_volume:season,
  data = avocados_cleaned
)

summary(model_5a)
##
## Call:
## lm(formula = average_price ~ total_volume + region + season +
## year + organic, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.75626 -0.09694 0.00174 0.10329 0.83897
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.500e+00 2.055e-02 73.003 < 2e-16 ***
## total_volume -5.085e-08 3.546e-09 -14.341 < 2e-16 ***
## regionGreatLakes -1.229e-01 1.402e-02 -8.764 < 2e-16 ***
## regionMidsouth -6.888e-02 1.432e-02 -4.809 1.60e-06 ***
## regionNortheast 1.591e-01 1.365e-02 11.658 < 2e-16 ***
## regionPlains -6.679e-02 1.523e-02 -4.385 1.20e-05 ***
## regionSouthCentral -2.967e-01 1.324e-02 -22.410 < 2e-16 ***
## regionSoutheast -5.955e-02 1.393e-02 -4.274 1.99e-05 ***
## regionWest -1.144e-01 1.325e-02 -8.632 < 2e-16 ***
## seasonspring -1.877e-01 9.705e-03 -19.342 < 2e-16 ***
## seasonsummer -6.577e-02 9.853e-03 -6.675 2.99e-11 ***
## seasonwinter -2.367e-01 9.724e-03 -24.342 < 2e-16 ***
## year2016 -2.841e-02 8.469e-03 -3.354 0.000807 ***
## year2017 1.397e-01 8.435e-03 16.567 < 2e-16 ***
## year2018 1.170e-01 1.447e-02 8.084 9.36e-16 ***
## organicTRUE 2.734e-01 1.596e-02 17.130 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1721 on 2688 degrees of freedom
## Multiple R-squared: 0.759, Adjusted R-squared: 0.7577
## F-statistic: 564.4 on 15 and 2688 DF, p-value: < 2.2e-16
# Diagnostic plots for model_5a on a 2 x 2 grid, then the model_5b summary.
par(mfrow = c(2, 2))
plot(model_5a)

summary(model_5b)
##
## Call:
## lm(formula = average_price ~ total_volume + region + season +
## year + total_volume:region, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.70586 -0.08656 0.00425 0.09299 0.84987
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.753e+00 1.460e-02 120.043 < 2e-16 ***
## total_volume -1.006e-07 2.976e-09 -33.785 < 2e-16 ***
## regionGreatLakes -1.845e-01 1.809e-02 -10.198 < 2e-16 ***
## regionMidsouth -8.784e-02 1.812e-02 -4.847 1.32e-06 ***
## regionNortheast 1.704e-01 1.808e-02 9.424 < 2e-16 ***
## regionPlains 1.557e-02 1.798e-02 0.866 0.386377
## regionSouthCentral -3.611e-01 1.798e-02 -20.080 < 2e-16 ***
## regionSoutheast -6.800e-02 1.780e-02 -3.820 0.000136 ***
## regionWest -1.246e-01 1.810e-02 -6.883 7.29e-12 ***
## seasonspring -1.542e-01 9.149e-03 -16.852 < 2e-16 ***
## seasonsummer -3.674e-02 9.334e-03 -3.937 8.47e-05 ***
## seasonwinter -2.080e-01 9.196e-03 -22.623 < 2e-16 ***
## year2016 -1.490e-02 8.072e-03 -1.846 0.065031 .
## year2017 1.545e-01 8.038e-03 19.218 < 2e-16 ***
## year2018 1.607e-01 1.371e-02 11.725 < 2e-16 ***
## total_volume:regionGreatLakes -1.731e-09 6.070e-09 -0.285 0.775481
## total_volume:regionMidsouth -3.830e-08 6.860e-09 -5.584 2.59e-08 ***
## total_volume:regionNortheast -2.736e-08 5.284e-09 -5.178 2.41e-07 ***
## total_volume:regionPlains -2.041e-07 1.023e-08 -19.959 < 2e-16 ***
## total_volume:regionSouthCentral 2.063e-08 4.234e-09 4.871 1.17e-06 ***
## total_volume:regionSoutheast -2.879e-08 5.607e-09 -5.134 3.03e-07 ***
## total_volume:regionWest 5.807e-09 4.138e-09 1.403 0.160674
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1645 on 2682 degrees of freedom
## Multiple R-squared: 0.7804, Adjusted R-squared: 0.7787
## F-statistic: 454 on 21 and 2682 DF, p-value: < 2.2e-16
# Diagnostic plots for model_5b (adds the total_volume:region interaction),
# drawn on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model_5b)

# Candidate 5c: coefficient table and fit statistics.
summary(model_5c)
##
## Call:
## lm(formula = average_price ~ total_volume + region + season +
## year + small_hass_percent, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.73312 -0.10642 0.00458 0.11190 0.81680
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.830e+00 1.587e-02 115.295 < 2e-16 ***
## total_volume -9.938e-08 1.882e-09 -52.801 < 2e-16 ***
## regionGreatLakes -2.269e-01 1.492e-02 -15.206 < 2e-16 ***
## regionMidsouth -1.781e-01 1.457e-02 -12.219 < 2e-16 ***
## regionNortheast 6.422e-02 1.557e-02 4.124 3.83e-05 ***
## regionPlains -1.764e-01 1.428e-02 -12.351 < 2e-16 ***
## regionSouthCentral -2.731e-01 1.449e-02 -18.849 < 2e-16 ***
## regionSoutheast -1.054e-01 1.441e-02 -7.318 3.32e-13 ***
## regionWest -1.093e-01 1.387e-02 -7.882 4.65e-15 ***
## seasonspring -1.581e-01 1.001e-02 -15.793 < 2e-16 ***
## seasonsummer -4.196e-02 1.021e-02 -4.110 4.08e-05 ***
## seasonwinter -2.122e-01 1.006e-02 -21.089 < 2e-16 ***
## year2016 -3.573e-02 9.303e-03 -3.841 0.000126 ***
## year2017 1.314e-01 9.354e-03 14.043 < 2e-16 ***
## year2018 1.260e-01 1.552e-02 8.114 7.38e-16 ***
## small_hass_percent -1.963e-03 3.166e-04 -6.202 6.42e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.18 on 2688 degrees of freedom
## Multiple R-squared: 0.7365, Adjusted R-squared: 0.735
## F-statistic: 500.8 on 15 and 2688 DF, p-value: < 2.2e-16
# Diagnostic plots for model_5c (adds small_hass_percent), drawn on a
# 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model_5c)

# Candidate 5d: coefficient table and fit statistics.
summary(model_5d)
##
## Call:
## lm(formula = average_price ~ total_volume + region + season +
## year + total_volume:season, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.69131 -0.10511 0.00172 0.10644 0.77955
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.830e+00 1.461e-02 125.209 < 2e-16 ***
## total_volume -1.347e-07 3.723e-09 -36.192 < 2e-16 ***
## regionGreatLakes -1.967e-01 1.387e-02 -14.177 < 2e-16 ***
## regionMidsouth -1.561e-01 1.393e-02 -11.202 < 2e-16 ***
## regionNortheast 1.066e-01 1.380e-02 7.722 1.60e-14 ***
## regionPlains -1.876e-01 1.411e-02 -13.290 < 2e-16 ***
## regionSouthCentral -2.994e-01 1.373e-02 -21.803 < 2e-16 ***
## regionSoutheast -1.290e-01 1.386e-02 -9.308 < 2e-16 ***
## regionWest -1.055e-01 1.373e-02 -7.682 2.17e-14 ***
## seasonspring -2.398e-01 1.339e-02 -17.902 < 2e-16 ***
## seasonsummer -8.668e-02 1.376e-02 -6.301 3.45e-10 ***
## seasonwinter -2.733e-01 1.320e-02 -20.710 < 2e-16 ***
## year2016 -1.918e-02 8.760e-03 -2.189 0.0286 *
## year2017 1.496e-01 8.723e-03 17.145 < 2e-16 ***
## year2018 1.466e-01 1.488e-02 9.852 < 2e-16 ***
## total_volume:seasonspring 4.097e-08 4.547e-09 9.012 < 2e-16 ***
## total_volume:seasonsummer 2.522e-08 4.754e-09 5.306 1.21e-07 ***
## total_volume:seasonwinter 3.307e-08 4.451e-09 7.431 1.45e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1785 on 2686 degrees of freedom
## Multiple R-squared: 0.741, Adjusted R-squared: 0.7394
## F-statistic: 452.1 on 17 and 2686 DF, p-value: < 2.2e-16
# Diagnostic plots for model_5d (adds the total_volume:season interaction),
# drawn on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model_5d)

# F-test of the nested pair: model_4e (no interaction) against model_5b
# (model_4e plus total_volume:region).
anova(model_4e, model_5b)
## Analysis of Variance Table
##
## Model 1: average_price ~ total_volume + region + season + year
## Model 2: average_price ~ total_volume + region + season + year + total_volume:region
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2689 88.327
## 2 2682 72.547 7 15.779 83.335 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
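The total_volume:region interaction is significant overall (F = 83.3, p < 2.2e-16), even though some of its individual terms (GreatLakes and West) have high p-values, so it is kept in the model.
R-squared is now up to 0.7804 and the residual standard error is down to 0.1645.
The Q-Q plot shows drifting at both tails.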
# 6th predictor ----
# Each candidate extends the current model (main effects plus the
# total_volume:region interaction) with exactly one additional term.

# 6a: add small_hass_percent.
model_6a <- lm(
  average_price ~ total_volume + region + season + year +
    total_volume:region + small_hass_percent,
  data = avocados_cleaned
)

# 6b: add organic.
model_6b <- lm(
  average_price ~ total_volume + region + season + year +
    total_volume:region + organic,
  data = avocados_cleaned
)

# 6c: add the total_volume:season interaction.
model_6c <- lm(
  average_price ~ total_volume + region + season + year +
    total_volume:region + total_volume:season,
  data = avocados_cleaned
)

# Candidate 6a: coefficient table and fit statistics.
summary(model_6a)
##
## Call:
## lm(formula = average_price ~ total_volume + region + season +
## year + total_volume:region + small_hass_percent, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.70211 -0.08781 0.00337 0.09190 0.87630
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.665e+00 1.877e-02 88.701 < 2e-16 ***
## total_volume -1.079e-07 3.113e-09 -34.654 < 2e-16 ***
## regionGreatLakes -1.299e-01 1.941e-02 -6.691 2.68e-11 ***
## regionMidsouth -3.442e-02 1.938e-02 -1.776 0.075863 .
## regionNortheast 2.175e-01 1.903e-02 11.430 < 2e-16 ***
## regionPlains 5.607e-02 1.864e-02 3.007 0.002659 **
## regionSouthCentral -3.967e-01 1.846e-02 -21.486 < 2e-16 ***
## regionSoutheast -6.314e-02 1.764e-02 -3.579 0.000351 ***
## regionWest -1.200e-01 1.794e-02 -6.693 2.66e-11 ***
## seasonspring -1.557e-01 9.063e-03 -17.178 < 2e-16 ***
## seasonsummer -3.688e-02 9.244e-03 -3.990 6.79e-05 ***
## seasonwinter -2.074e-01 9.107e-03 -22.769 < 2e-16 ***
## year2016 1.235e-02 8.821e-03 1.400 0.161507
## year2017 1.843e-01 8.944e-03 20.604 < 2e-16 ***
## year2018 1.996e-01 1.458e-02 13.690 < 2e-16 ***
## small_hass_percent 2.857e-03 3.908e-04 7.311 3.48e-13 ***
## total_volume:regionGreatLakes -4.321e-09 6.022e-09 -0.718 0.473104
## total_volume:regionMidsouth -4.803e-08 6.923e-09 -6.937 4.99e-12 ***
## total_volume:regionNortheast -1.876e-08 5.364e-09 -3.498 0.000477 ***
## total_volume:regionPlains -2.547e-07 1.226e-08 -20.767 < 2e-16 ***
## total_volume:regionSouthCentral 1.969e-08 4.195e-09 4.693 2.82e-06 ***
## total_volume:regionSoutheast -4.719e-08 6.097e-09 -7.740 1.39e-14 ***
## total_volume:regionWest 6.220e-09 4.099e-09 1.517 0.129269
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1629 on 2681 degrees of freedom
## Multiple R-squared: 0.7847, Adjusted R-squared: 0.783
## F-statistic: 444.3 on 22 and 2681 DF, p-value: < 2.2e-16
# Diagnostic plots for model_6a, drawn on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model_6a)

# Candidate 6b: coefficient table and fit statistics.
summary(model_6b)
##
## Call:
## lm(formula = average_price ~ total_volume + region + season +
## year + total_volume:region + organic, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.70882 -0.08823 0.00450 0.09320 0.84389
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.697e+00 2.577e-02 65.860 < 2e-16 ***
## total_volume -9.026e-08 4.935e-09 -18.288 < 2e-16 ***
## regionGreatLakes -1.849e-01 1.807e-02 -10.233 < 2e-16 ***
## regionMidsouth -8.890e-02 1.811e-02 -4.910 9.68e-07 ***
## regionNortheast 1.699e-01 1.806e-02 9.403 < 2e-16 ***
## regionPlains 1.590e-02 1.796e-02 0.885 0.376093
## regionSouthCentral -3.613e-01 1.796e-02 -20.113 < 2e-16 ***
## regionSoutheast -6.592e-02 1.780e-02 -3.704 0.000217 ***
## regionWest -1.254e-01 1.808e-02 -6.935 5.09e-12 ***
## seasonspring -1.614e-01 9.550e-03 -16.904 < 2e-16 ***
## seasonsummer -4.292e-02 9.618e-03 -4.462 8.44e-06 ***
## seasonwinter -2.143e-01 9.489e-03 -22.579 < 2e-16 ***
## year2016 -1.775e-02 8.137e-03 -2.182 0.029220 *
## year2017 1.513e-01 8.122e-03 18.622 < 2e-16 ***
## year2018 1.511e-01 1.418e-02 10.661 < 2e-16 ***
## organicTRUE 6.360e-02 2.432e-02 2.615 0.008973 **
## total_volume:regionGreatLakes 6.199e-09 6.779e-09 0.914 0.360569
## total_volume:regionMidsouth -2.705e-08 8.092e-09 -3.343 0.000839 ***
## total_volume:regionNortheast -2.254e-08 5.591e-09 -4.031 5.71e-05 ***
## total_volume:regionPlains -1.807e-07 1.358e-08 -13.304 < 2e-16 ***
## total_volume:regionSouthCentral 2.087e-08 4.230e-09 4.934 8.53e-07 ***
## total_volume:regionSoutheast -2.300e-08 6.022e-09 -3.820 0.000137 ***
## total_volume:regionWest 5.515e-09 4.135e-09 1.334 0.182423
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1643 on 2681 degrees of freedom
## Multiple R-squared: 0.781, Adjusted R-squared: 0.7792
## F-statistic: 434.6 on 22 and 2681 DF, p-value: < 2.2e-16
# Diagnostic plots for model_6b, drawn on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model_6b)

# Candidate 6c: coefficient table and fit statistics.
summary(model_6c)
##
## Call:
## lm(formula = average_price ~ total_volume + region + season +
## year + total_volume:region + total_volume:season, data = avocados_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.68088 -0.08304 -0.00354 0.09324 0.80282
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.803e+00 1.549e-02 116.435 < 2e-16 ***
## total_volume -1.275e-07 4.268e-09 -29.884 < 2e-16 ***
## regionGreatLakes -1.870e-01 1.779e-02 -10.511 < 2e-16 ***
## regionMidsouth -9.098e-02 1.783e-02 -5.103 3.59e-07 ***
## regionNortheast 1.684e-01 1.779e-02 9.467 < 2e-16 ***
## regionPlains 1.061e-02 1.769e-02 0.600 0.5486
## regionSouthCentral -3.608e-01 1.769e-02 -20.401 < 2e-16 ***
## regionSoutheast -6.999e-02 1.751e-02 -3.997 6.58e-05 ***
## regionWest -1.239e-01 1.780e-02 -6.957 4.34e-12 ***
## seasonspring -2.295e-01 1.215e-02 -18.895 < 2e-16 ***
## seasonsummer -7.760e-02 1.247e-02 -6.221 5.72e-10 ***
## seasonwinter -2.652e-01 1.197e-02 -22.162 < 2e-16 ***
## year2016 -1.642e-02 7.941e-03 -2.067 0.0388 *
## year2017 1.529e-01 7.908e-03 19.340 < 2e-16 ***
## year2018 1.565e-01 1.350e-02 11.594 < 2e-16 ***
## total_volume:regionGreatLakes -1.392e-09 5.970e-09 -0.233 0.8156
## total_volume:regionMidsouth -3.759e-08 6.748e-09 -5.569 2.81e-08 ***
## total_volume:regionNortheast -2.684e-08 5.198e-09 -5.163 2.61e-07 ***
## total_volume:regionPlains -2.024e-07 1.006e-08 -20.115 < 2e-16 ***
## total_volume:regionSouthCentral 2.062e-08 4.166e-09 4.950 7.87e-07 ***
## total_volume:regionSoutheast -2.867e-08 5.515e-09 -5.199 2.16e-07 ***
## total_volume:regionWest 5.425e-09 4.071e-09 1.333 0.1827
## total_volume:seasonspring 3.871e-08 4.122e-09 9.391 < 2e-16 ***
## total_volume:seasonsummer 2.359e-08 4.309e-09 5.473 4.83e-08 ***
## total_volume:seasonwinter 3.105e-08 4.037e-09 7.693 2.00e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1618 on 2679 degrees of freedom
## Multiple R-squared: 0.7879, Adjusted R-squared: 0.786
## F-statistic: 414.6 on 24 and 2679 DF, p-value: < 2.2e-16
# Diagnostic plots for model_6c, drawn on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model_6c)